// cd /projects/hsieh_project/proj_201809/code_1_data/
// qstata --dofile=data_2_sales_cityind_sel.do --statatype=mp --cpucount=5 &

// Start from a clean session: no data in memory, no programs defined
clear all
capture noisily program drop _all
cd /projects/

display "Started at $S_TIME $S_DATE"

// Current date formatted as YYMMDD, stored in rev_date and echoed to the log
global rev_date: display %tdYYNNDD date("$S_DATE", "DMY")
display "${rev_date}"

// Project root and data directory
global dir_proj "/projects/hsieh_project/proj_201809/"
global dir_data "${dir_proj}/data/"

// Labels assigned to the initial and terminal years
global year1 1977
global year2 2013

// gl_perc: suffixes of the sales_ind_* variables aggregated in gen_sel
// gl_city: city definitions that gen_sel iterates over
global gl_perc "10 5 1"
global gl_city "fips msa czone msa1983 msa1983cz"

//==============================================================================
/* 
Program gen_sel:
1) (currently disabled) calculates Sato-Vartia weights for each city-industry
2) generates variables of city-level sales in the initial and terminal years
3) calculates HHI for each city-industry
The final data set appends the data for the different city definitions together.
The definition used for each row is recorded in city_code.
*/

capture program drop gen_sel
program gen_sel
/*
gen_sel: build the city-industry sales panel for the initial and terminal years.

Takes no arguments; reads these globals at run time:
  gl_city   list of city definitions to loop over
  gl_perc   suffixes of the sales_ind_* variables to aggregate
  dir_data  directory holding the input data sets
  year1     label assigned to each industry's initial year (year_min)
  year2     label assigned to each industry's terminal year (year_max)
  ds_out    output data set; must already exist (it may be empty)

For every city definition the result is appended to ${ds_out}, which is
re-saved after each iteration.
*/

/* Iteration over all definitions of cities */
foreach i_city in $gl_city {

	di "`i_city'"
	//local i_city = "fips"
	use "${dir_data}/sales_cityind_sum_`i_city'", clear
	rename `i_city' city

	// Attach the crosswalk and keep only industries present in it
	// (the year_min / year_max used below are not in the visible input
	// handling; presumably they come from this crosswalk)
	merge m:1 ch_ind using "${dir_data}/cw_year_sales" //, assert(match) nogen
	tab ch_ind if _merge != 3
	keep if _merge == 3
	drop _merge

	// Keep the initial and terminal year of each industry and relabel them
	// as year1 / year2. The comparison is made against year_orig, which is
	// never modified: comparing against an already relabelled year would
	// turn initial-year rows into year2 rows whenever year_max == ${year1}.
	rename year year_orig
	keep if year_orig == year_min | year_orig == year_max
	gen year = cond(year_orig == year_min, ${year1}, ${year2})
	keep if inlist(year, ${year1}, ${year2})

	// Skip this city definition when it has no terminal-year observations
	qui count if year == ${year2}
	if `r(N)' == 0 {
		di "No `i_city', ${year2}"
		continue
	}

	/*
	----------------------------------------------------------------------------
	   DISABLED: the Sato-Vartia block below is commented out and does not run.
	   Calculate Sato-Vartia weight for each industry in each city
	   w_sv: Drop city-industry where employment does not change
	   w_sva: Use city-industry employment as numerator when employment does not change
	   w_sva is the preferred version of Sato-Vartia weight
	*/
	/*
	/* Calculate the numerator */
	sort city ch_ind year
	by city ch_ind: gen sv_num1 = emp_cind[2] - emp_cind[1]
	by city ch_ind: gen sv_num2 = ln(emp_cind[2]) - ln(emp_cind[1])
	gen sv_num = sv_num1 / sv_num2
	
	by city ch_ind: gen sv_emp_same = 1 if emp_cind[2] == emp_cind[1] & _N == 2
	gen sv_num_alt = sv_num 
	replace sv_num_alt = emp_cind if sv_emp_same == 1
	
	/* Calculate the denominator */
	sort city year
	by city year: egen sv_den = total(sv_num)
	by city year: egen sv_den_alt = total(sv_num_alt)
	
	/* Calculate the weight */
	gen w_sv = sv_num / sv_den
	gen w_sva = sv_num_alt / sv_den_alt
	
	/* Quality check: Should sum up to 1 */
	by city year: egen tot_w_sv = total(w_sv)
	replace tot_w_sv = round(tot_w_sv, 0.01)
	tab year tot_w_sv
	drop tot_w_sv
	
	by city year: egen tot_w_sva = total(w_sva)
	replace tot_w_sva = round(tot_w_sva, 0.01)
	tab year tot_w_sva
	drop tot_w_sva
	
	drop sv_emp_same
	*/

	/*
	----------------------------------------------------------------------------
	   Calculate city sales in the initial and terminal years
	*/

	sort city year
	by city year: egen sales_city = total(sales_cind)
	by city: gen sales_city_${year1} = sales_city if year == ${year1}
	by city: gen sales_city_${year2} = sales_city if year == ${year2}
	// Missing values sort last, so observation [1] within a city holds the
	// non-missing total (if any); copy it to every row of that city
	sort city sales_city_${year1}
	by city: replace sales_city_${year1} = sales_city_${year1}[1]
	sort city sales_city_${year2}
	by city: replace sales_city_${year2} = sales_city_${year2}[1]

	/*
	----------------------------------------------------------------------------
	   Calculate city-year totals of sales_ind_* (one per entry of gl_perc)
	   and of sales_cind_t1
	*/

	sort city year
	foreach i_perc in $gl_perc {
		by city year: egen sales_c_ind_`i_perc' = total(sales_ind_`i_perc')
	}
	by city year: egen sales_c_cind_t1 = total(sales_cind_t1)

	/*
	----------------------------------------------------------------------------
	   Calculate HHI for city-industry
	   hhi_cind: The default definition (sum of share^2)
	   hhin_cind: Normalized HHI (legacy)
	   hhir_cind: HHI adjusted by numbers of firms (legacy)
	*/

	// If hhi_cind cannot be generated (e.g. the input already carries a
	// variable of that name), recompute it as hhi_cind2 and report any
	// disagreement with the existing values.
	// NOTE(review): hhi_cind2 is not dropped and therefore ends up in the
	// saved output -- confirm whether that is intended.
	capture noi gen double hhi_cind = sales2_cindf / sales_cind^2
	if _rc != 0 {
		gen double hhi_cind2 = sales2_cindf / sales_cind^2
		qui count if hhi_cind2 != hhi_cind
		if `r(N)' != 0 {
			di "HHI Quality Issue!"
		}
	}
	gen hhin_cind = (hhi_cind * n_cind - 1) / (n_cind - 1)
	// With a single firm the normalisation divides by zero; set it to 1
	replace hhin_cind = 1 if n_cind == 1
	gen hhir_cind = hhi_cind * n_cind

	rename hhi_cind sales_hhi_cind
	rename hhin_cind sales_hhin_cind
	rename hhir_cind sales_hhir_cind

	// Tag rows with the city definition and the year labels used
	gen city_code = "`i_city'"
	gen year1 = ${year1}
	gen year2 = ${year2}

	// Stack onto the results accumulated so far and write them back
	append using "${ds_out}"
	save "${ds_out}", replace

}
end

// Output data set: initialise it as an empty file so gen_sel can append to it
global ds_out "${dir_data}/sales_cityind_sum_sel"
clear
save ${ds_out}, emptyok replace

// Set the year labels read by gen_sel, then build the data set
global year1 1977
global year2 2013
gen_sel

display "Ended at $S_DATE $S_TIME"
// End of do file
